Aging_Mice_GSE129788 Analysis
PAPER
“Single-cell transcriptomic profiling of the aging mouse brain” Isolated single cell from mice brain to perform single cell RNAseq 8 young and 8 old mice snRNAseq samples
#---------- Knitr
setwd("/Volumes/projects_secondary/bras/Lee/ASAP/Aging_Mice_GSE129788")
knitr::opts_chunk$set(warning=FALSE, message=FALSE, cache=FALSE, error=FALSE, cache.lazy=FALSE)
#knitr::opts_knit$set(root.dir='/Volumes/projects_secondary/bras/Lee/ASAP/Aging_Mice_GSE129788')
#options(scipen=999)
#options(stringsAsFactors = FALSE)
# Chunk Options
# include = FALSE, runs code, prevents code and results from appearing
# echo = FALSE, runs code, prevents code from appearing but not the results
# eval = FALSE, does not run code
# message = FALSE prevents messages that are generated by code from appearing
# warning = FALSE prevents warnings that are generated by code from appearing
# fig.cap = "..." adds a caption to graphical results.
#---------- Packages
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("fgsea")
#install.packages('metap')
suppressPackageStartupMessages({
library(betareg)
library(biomaRt)
library(corrplot)
library(cowplot)
library(DoubletFinder)
library(dplyr)
library(DT)
library(fgsea)
library(ggplot2)
library(ggpubr)
library(gprofiler2)
library(knitr)
library(kableExtra)
library(lubridate)
library(magrittr)
library(MAST)
library(metap)
library(multtest)
library(parallel)
library(purrr)
library(RColorBrewer)
library(readr)
library(reshape2)
library(reticulate)
library(rmarkdown)
library(Seurat)
library(stringr)
library(tibble)
library(tidyr)
library(viridis)
})
## Warning: package 'GenomeInfoDb' was built under R version 4.0.4
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] viridis_0.5.1 viridisLite_0.3.0
## [3] tidyr_1.1.3 tibble_3.1.0
## [5] stringr_1.4.0 SeuratObject_4.0.0
## [7] Seurat_4.0.0 rmarkdown_2.7
## [9] reticulate_1.18 reshape2_1.4.4
## [11] readr_1.4.0 RColorBrewer_1.1-2
## [13] purrr_0.3.4 multtest_2.46.0
## [15] metap_1.4 MAST_1.16.0
## [17] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0 GenomicRanges_1.42.0
## [21] GenomeInfoDb_1.26.4 IRanges_2.24.1
## [23] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [25] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [27] magrittr_2.0.1 lubridate_1.7.10
## [29] kableExtra_1.3.4 knitr_1.31
## [31] gprofiler2_0.2.0 ggpubr_0.4.0
## [33] ggplot2_3.3.3 fgsea_1.16.0
## [35] DT_0.17 dplyr_1.0.5
## [37] DoubletFinder_2.0.3 cowplot_1.1.1
## [39] corrplot_0.84 biomaRt_2.46.3
## [41] betareg_3.1-4
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 tidyselect_1.1.0 RSQLite_2.2.4
## [4] AnnotationDbi_1.52.0 htmlwidgets_1.5.3 grid_4.0.3
## [7] BiocParallel_1.24.1 Rtsne_0.15 munsell_0.5.0
## [10] ica_1.0-2 codetools_0.2-18 mutoss_0.1-12
## [13] miniUI_0.1.1.1 future_1.21.0 withr_2.4.1
## [16] colorspace_2.0-0 rstudioapi_0.13 ROCR_1.0-11
## [19] tensor_1.5 ggsignif_0.6.1 listenv_0.8.0
## [22] Rdpack_2.1.1 GenomeInfoDbData_1.2.4 polyclip_1.10-0
## [25] mnormt_2.0.2 bit64_4.0.5 parallelly_1.24.0
## [28] vctrs_0.3.6 generics_0.1.0 TH.data_1.0-10
## [31] xfun_0.22 BiocFileCache_1.14.0 R6_2.5.0
## [34] flexmix_2.3-17 spatstat.utils_2.1-0 bitops_1.0-6
## [37] cachem_1.0.4 DelayedArray_0.16.2 assertthat_0.2.1
## [40] promises_1.2.0.1 scales_1.1.1 multcomp_1.4-16
## [43] nnet_7.3-15 gtable_0.3.0 globals_0.14.0
## [46] goftest_1.2-2 sandwich_3.0-0 rlang_0.4.10
## [49] systemfonts_1.0.1 splines_4.0.3 rstatix_0.7.0
## [52] lazyeval_0.2.2 broom_0.7.5 yaml_2.2.1
## [55] abind_1.4-5 backports_1.2.1 httpuv_1.5.5
## [58] tools_4.0.3 ellipsis_0.3.1 jquerylib_0.1.3
## [61] ggridges_0.5.3 TFisher_0.2.0 Rcpp_1.0.6
## [64] plyr_1.8.6 progress_1.2.2 zlibbioc_1.36.0
## [67] RCurl_1.98-1.2 prettyunits_1.1.1 deldir_0.2-10
## [70] rpart_4.1-15 openssl_1.4.3 pbapply_1.4-3
## [73] zoo_1.8-9 ggrepel_0.9.1 haven_2.3.1
## [76] cluster_2.1.1 scattermore_0.7 data.table_1.14.0
## [79] openxlsx_4.2.3 lmtest_0.9-38 RANN_2.6.1
## [82] tmvnsim_1.0-2 mvtnorm_1.1-1 fitdistrplus_1.1-3
## [85] patchwork_1.1.1 xtable_1.8-4 mime_0.10
## [88] hms_1.0.0 evaluate_0.14 XML_3.99-0.5
## [91] rio_0.5.26 readxl_1.3.1 gridExtra_2.3
## [94] compiler_4.0.3 KernSmooth_2.23-18 crayon_1.4.1
## [97] htmltools_0.5.1.1 mgcv_1.8-34 later_1.1.0.1
## [100] Formula_1.2-4 DBI_1.1.1 dbplyr_2.1.0
## [103] MASS_7.3-53.1 rappdirs_0.3.3 Matrix_1.3-2
## [106] car_3.0-10 rbibutils_2.0 igraph_1.2.6
## [109] forcats_0.5.1 pkgconfig_2.0.3 sn_1.6-2
## [112] numDeriv_2016.8-1.1 foreign_0.8-81 plotly_4.9.3
## [115] xml2_1.3.2 svglite_2.0.0 bslib_0.2.4
## [118] webshot_0.5.2 XVector_0.30.0 rvest_1.0.0
## [121] digest_0.6.27 sctransform_0.3.2 RcppAnnoy_0.0.18
## [124] spatstat.data_2.0-0 leiden_0.3.7 cellranger_1.1.0
## [127] fastmatch_1.1-0 uwot_0.1.10 curl_4.3
## [130] shiny_1.6.0 modeltools_0.2-23 nlme_3.1-152
## [133] lifecycle_1.0.0 jsonlite_1.7.2 carData_3.0-4
## [136] askpass_1.1 fansi_0.4.2 pillar_1.5.1
## [139] lattice_0.20-41 fastmap_1.1.0 httr_1.4.2
## [142] plotrix_3.8-1 survival_3.2-9 glue_1.4.2
## [145] spatstat_1.64-1 zip_2.1.1 png_0.1-7
## [148] bit_4.0.4 stringi_1.5.3 sass_0.3.1
## [151] blob_1.2.1 memoise_2.0.0 mathjaxr_1.4-0
## [154] irlba_2.3.3 future.apply_1.7.0
# library(reticulate)
# use_condaenv("base")
# use_python("/Users/lee.marshall/opt/anaconda3/bin/python", required = TRUE)
# display python config
# print(py_config())
# display module search path
# sys <- import("sys")
# sys$path
# py_module_available("scanpy")
GSE129788 fastq files only contain single forward read
GEO advised me to download bams from sra run selector - run - data access
Used network pc to download directly to hpc
bamtofastq to recreate fastq files
merge fastq files per sample prior to cellranger
cellranger-6.0.1 to align and count
performed mRNA alignment for standard single cell counts
also performed mRNA and premRNA (introns) counts for trajectory analysis
#--------- bamtofastq
qsubs.sh -n bamtofastq_cellranger -c 10 -a names.txt -s "bamtofastq --nthreads=10 --reads-per-fastq=1000000000 SAMPLE.bam SAMPLE_fastq"
#--------- Merge fastq files
qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*R1_001.fastq.gz > SAMPLE_S1_L001_R1_001.fastq.gz"
qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*R2_001.fastq.gz > SAMPLE_S1_L001_R2_001.fastq.gz"
qsubs.sh -a names.txt -n fastq_merge -s "cat SAMPLE_fastq/*/*I1_001.fastq.gz > SAMPLE_S1_L001_I1_001.fastq.gz"
#---------- cellranger_mRNA
module load cellranger/cellranger-6.0.1
qsubs.sh -n cellranger_count -c 4 -w 48 -a names.txt -s "cellranger count --sample=SAMPLE --id=SAMPLE_count --localcores=4 --fastqs=./fastq --chemistry=SC3Pv2 --transcriptome=/home/lee.marshall/bras_primary/software/cellranger_v6/refdata-gex-mm10-2020-A"
#---------- cellranger_mRNA_premRNA
module load cellranger/cellranger-6.0.1
qsubs.sh -n cellranger_count -c 4 -w 48 -a names.txt -s "cellranger count --sample=SAMPLE --id=SAMPLE_count --localcores=4 --fastqs=./fastq --chemistry=SC3Pv2 --include-introns --transcriptome=/home/lee.marshall/bras_primary/software/cellranger_v6/refdata-gex-mm10-2020-A"
Quality control graphs used to identify filtering thresholds
Senescence Cell not selectively removed
#---------- Seurat_Object
#---- Counts
dir <- "cellranger_mRNA/"
name <- list.files(path = "cellranger_mRNA") %>%
gsub(pattern = "_count", replacement = "")
#---- Count Matrix
seurat_counts <- mclapply(name, function(i){
count <- Read10X(data.dir=paste0(dir,i,"_count/outs/filtered_feature_bc_matrix/"),
gene.column=2, # col1 Ensemble, col2 Symbol
unique.features=TRUE,
strip.suffix=TRUE)
seurat_obj <- CreateSeuratObject(counts=count,
project=i,
min.cells=10,
min.genes=200)
})
#---- Merge Seurat Objects
seurat_combined <- merge(x = seurat_counts[[1]],
y = seurat_counts[2:length(seurat_counts)],
add.cell.ids = unlist(name),
project="Aging_Mice_GSE129788")
#---------- Add Metadata
# View(seurat_combined@meta.data)
# orig.ident: contains the sample identity if known
# nCount_RNA: number of UMIs per cell
# nFeature_RNA: number of genes detected per cell
#---- Create metadata dataframe
metadata <- seurat_combined@meta.data
#---- Add cell IDs to metadata
metadata$cells <- rownames(metadata)
#---- Rename columns
names(metadata)[1:3] <- c("name","nUMI","nGene")
#---- Add number of genes per UMI for each cell to metadata
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)
#---- Compute percent mito ratio
metadata <- cbind(metadata,
PercentageFeatureSet(object = seurat_combined, pattern = "^mt-") / 100)
names(metadata)[6] <- "mitoRatio"
#---- Create Sample
metadata <- metadata %>%
left_join(data.frame(name = unique(metadata$name),
sample = c("Old1","Old2","Old3","Old4","Old5","Old6","Old7","Old8",
"Young1","Young2","Young3","Young4","Young5","Young6",
"Young7","Young8"),
group = rep(c("Old","Young"),each=8)), by = "name")
#---- Add metadata back to Seurat object
rownames(metadata) <- metadata$cells
seurat_combined@meta.data <- metadata
#---- Senescence Markers, p16 = Cdkn2a, p21 = Cdkn1a
seurat_combined$Cdkn2a_Count <-
seurat_combined@assays$RNA@counts[rownames(seurat_combined) == "Cdkn2a",]
seurat_combined$Cdkn1a_Count <-
seurat_combined@assays$RNA@counts[rownames(seurat_combined) == "Cdkn1a",]
#---- rm data
rm(seurat_counts, metadata)
#---- save rds
saveRDS(seurat_combined, file="seurat_results/seurat_combined.rds")
#---------- Cell_Counts
#----- Load data
seurat_combined <- readRDS(file = "seurat_results/seurat_combined.rds")
#----- Visualize the number of cell counts per sample
seurat_combined@meta.data %>%
ggplot(aes(x=sample, fill=sample)) +
geom_bar() +
ggtitle("Number of Cells") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold"))
#---------- UMI_Counts_per_cell (transcripts)
#----- Visualize the number UMIs/transcripts per cell
seurat_combined@meta.data %>%
ggplot(aes(color=sample, x=nUMI, fill= sample)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 500, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
ggtitle("Number of UMIs/Transcripts per Cell") +
ylab("Cell density") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold"))
#---------- Genes_per_cell
#----- Visualize the distribution of genes detected per cell via histogram
p1 <- seurat_combined@meta.data %>%
ggplot(aes(color=sample, x=nGene, fill= sample)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 250, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") +
ggtitle("Number of Genes per Cell") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold"))
#----- Visualize the distribution of genes detected per cell via boxplot
p2 <- seurat_combined@meta.data %>%
ggplot(aes(x=sample, y=log10(nGene), fill=sample)) +
geom_boxplot() +
ggtitle("Number of Genes vs Number of Cells") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold"))
#----- Plots
cowplot::plot_grid(p1, p2, ncol=2)
#---------- UMI_Counts_vs_Genes
#----- Visualize the correlation between genes detected and number of UMIs/transcripts
# determine whether strong presence of cells with low numbers of genes/UMIs
seurat_combined@meta.data %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point(size=0.1) +
geom_vline(xintercept = 500, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
geom_hline(yintercept = 250, linetype="dashed", colour = "red3") +
geom_hline(yintercept = 5000, linetype="dashed", colour = "red3") +
ggtitle("Number of UMIs/Transcripts vs Number of Cells") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
scale_colour_viridis_c() +
facet_wrap(~sample)
#---------- Mitochondrial_counts_ratio
#----- Visualize the distribution of mitochondrial gene expression detected per cell
seurat_combined@meta.data %>%
ggplot(aes(color=sample, x=mitoRatio, fill=sample)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") +
ggtitle("Mitochondrial Counts Ratio") +
ylab("Cell density") +
scale_x_log10() +
theme_classic()
#---------- Complexity
#----- Visualize the overall complexity of the gene expression by visualizing
# the genes detected per UMI
seurat_combined@meta.data %>%
ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") +
ggtitle("Complexity Genes per UMI") +
ylab("Cell density") +
theme_classic()
#---------- Senescence Markers
#----- Senescence marker vs Number of UMIs/Transcripts
seurat_combined@meta.data %>%
ggplot(aes(x=nUMI, y=Cdkn2a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 500, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
ggtitle("Cdkn2a Counts vs Number of UMIs/Transcripts") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
seurat_combined@meta.data %>%
ggplot(aes(x=nUMI, y=Cdkn1a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 500, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 10000, linetype="dashed", colour = "red3") +
ggtitle("Cdkn1a Counts vs Number of UMIs/Transcripts") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
#----- Senescence marker vs Number of Cells
seurat_combined@meta.data %>%
ggplot(aes(x=nGene, y=Cdkn2a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 250, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") +
ggtitle("Cdkn2a Counts vs Number of Cells") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
seurat_combined@meta.data %>%
ggplot(aes(x=nGene, y=Cdkn1a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 250, linetype="dashed", colour = "red3") +
geom_vline(xintercept = 5000, linetype="dashed", colour = "red3") +
ggtitle("Cdkn1a Counts vs Number of Cells") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
#----- Senescence marker vs mitoRatio
seurat_combined@meta.data %>%
ggplot(aes(x=mitoRatio, y=Cdkn2a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") +
ggtitle("Cdkn2a Counts vs Mitochondrial Counts Ratio") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
seurat_combined@meta.data %>%
ggplot(aes(x=mitoRatio, y=Cdkn1a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 0.20, linetype="dashed", colour = "red3") +
ggtitle("Cdkn1a Counts vs Mitochondrial Counts Ratio") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
#----- Senescence marker vs Complexity Genes per UMI
seurat_combined@meta.data %>%
ggplot(aes(x=log10GenesPerUMI, y=Cdkn2a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") +
ggtitle("Cdkn2a Counts vs Complexity Genes per UMI") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
seurat_combined@meta.data %>%
ggplot(aes(x=log10GenesPerUMI, y=Cdkn1a_Count, color=sample)) +
geom_point(size=0.5) +
geom_vline(xintercept = 0.85, linetype="dashed", colour = "red3") +
ggtitle("Cdkn1a Counts vs Complexity Genes per UMI") +
scale_x_log10() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~sample)
Remove poor quilty cells and doublets
Keep high quality cells without removing biologically relevant cells
* nUMI >= 500
* nUMI <= 10000
* nGene >= 250
* nGene <= 5000
* mitoRatio < 0.20
* log10GenesPerUMI > 0.85
#---------- Seurat_Filtering
# Filter out low quality reads using selected thresholds - these will change with experiment
seurat_filtered <- subset(x = seurat_combined,
subset = (nUMI >= 500) &
(nUMI <= 10000) &
(nGene >= 250) &
(nGene <= 5000) &
(mitoRatio < 0.20) &
(log10GenesPerUMI > 0.85))
# seurat_filtered <- seurat_combined@meta.data %>%
# filter(nUMI >= 500 &
# nUMI <= 20000 &
# nGene >= 250 &
# nGene <= 10000 &
# mitoRatio < 0.20 &
# log10GenesPerUMI > 0.85)
# seurat_filtered <- subset(seurat_combined, cells = seurat_filtered$cells)
#---- save rds
saveRDS(seurat_filtered, file="seurat_results/seurat_filtered.rds")
#---------- Cell_Counts_Pre_and_Post_Filtered
#----- Load data
seurat_filtered <- readRDS(file = "seurat_results/seurat_filtered.rds")
#----- Number of Cells
cbind(seurat_combined@meta.data$sample %>%
table() %>%
as.data.frame() %>%
dplyr::rename(Sample="."),
seurat_filtered@meta.data$sample %>%
table() %>%
as.data.frame() %>%
dplyr::rename(Sample=".")) %>%
kable(caption="Number of Cells", format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered")) %>%
add_header_above(c("Pre-Filtered" = 2, "Post-Filtered" = 2))
| Sample | Freq | Sample | Freq |
|---|---|---|---|
| Old1 | 2,826 | Old1 | 2,174 |
| Old2 | 2,264 | Old2 | 1,799 |
| Old3 | 6,437 | Old3 | 5,154 |
| Old4 | 6,035 | Old4 | 4,668 |
| Old5 | 3,293 | Old5 | 2,639 |
| Old6 | 3,395 | Old6 | 2,607 |
| Old7 | 3,662 | Old7 | 3,157 |
| Old8 | 4,114 | Old8 | 3,606 |
| Young1 | 2,433 | Young1 | 1,546 |
| Young2 | 3,049 | Young2 | 2,082 |
| Young3 | 3,067 | Young3 | 2,305 |
| Young4 | 4,077 | Young4 | 2,623 |
| Young5 | 3,233 | Young5 | 2,276 |
| Young6 | 3,547 | Young6 | 2,235 |
| Young7 | 5,226 | Young7 | 2,872 |
| Young8 | 6,405 | Young8 | 2,891 |
#---------- Violin_plots
cowplot::plot_grid(VlnPlot(object = seurat_combined,
features = c("nGene", "nUMI", "mitoRatio", "log10GenesPerUMI"),
ncol = 1,
pt.size=0,
group.by="sample"),
VlnPlot(object = seurat_filtered,
features = c("nGene", "nUMI", "mitoRatio", "log10GenesPerUMI"),
ncol = 1,
pt.size=0,
group.by="sample"),
labels=c("Pre-Filtered", "Post-Filtered"), ncol = 2)
#---------- Scatter_plots
cowplot::plot_grid(FeatureScatter(object = seurat_combined,
feature1 = "nGene",
feature2 = "nUMI",
pt.size=0.01,
group.by="sample"),
FeatureScatter(object = seurat_filtered,
feature1 = "nGene",
feature2 = "nUMI",
pt.size=0.01,
group.by="sample"),
FeatureScatter(object = seurat_combined,
feature1 = "nGene",
feature2 = "mitoRatio",
pt.size=0.01,
group.by="sample"),
FeatureScatter(object = seurat_filtered,
feature1 = "nGene",
feature2 = "mitoRatio",
pt.size=0.01,
group.by="sample"),
labels=c("Pre-Filtered", "Post-Filtered"), nrow=2, ncol=2)
#---- rm data
rm(seurat_combined)
Determine if cell cycle regression is required
#---------- Cell_Cycle_Scoring
#----- Normalize the counts
seurat_phase <- NormalizeData(seurat_filtered)
#----- Load cell cycle markers
load(file="seurat_results/seurat_cycle.rda")
#----- Convert human to mouse genes
Human2mouseGenes <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol",
values = x ,
mart = human,
attributesL = c("mgi_symbol"),
martL = mouse,
uniqueRows=T)
converted <- unique(genesV2[, 2])
return(converted)
}
m_g2m_genes <- Human2mouseGenes(g2m_genes)
m_s_genes <- Human2mouseGenes(s_genes)
#----- Identify the most variable genescbin
seurat_phase <- FindVariableFeatures(seurat_phase,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE)
#----- Scale the counts
seurat_phase <- ScaleData(seurat_phase)
#----- Perform PCA
seurat_phase <- RunPCA(seurat_phase)
#----- Score cells for cell cycle
seurat_phase <- CellCycleScoring(seurat_phase,
g2m.features = m_g2m_genes,
s.features = m_s_genes)
#---- save rds
saveRDS(seurat_phase, file="seurat_results/seurat_phase.rds")
#---------- PCA
#----- Load data
seurat_phase <- readRDS(file = "seurat_results/seurat_phase.rds")
#----- Plot the PCA colored by cell cycle phase
DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase",
split.by = "Phase")
# you can regress out the s and g2m score
# alternatively you could only regress out the s-g2m score to keep
# non-cyling and cycling cell differences
DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase",
split.by = "group")
#----- Number of Cell Cycle Phase Cells
seurat_phase@meta.data %>%
dplyr::group_by(group) %>%
dplyr::count(Phase) %>%
kable(caption="Number of Cell Cycle Phase Cells", format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| group | Phase | n |
|---|---|---|
| Old | G1 | 8,878 |
| Old | G2M | 8,379 |
| Old | S | 8,547 |
| Young | G1 | 6,646 |
| Young | G2M | 6,062 |
| Young | S | 6,122 |
#----- Remove data
rm(seurat_phase)
Standard batch correction is not considered standard anymore
SCTransform batch correction allows for better normalization
SCTransform method models the UMI counts using a regularized negative binomial
model to remove the variation due to sequencing depth (total nUMIs per cell),
while adjusting the variance based on pooling information across genes with
similar abundances (similar to some bulk RNA-seq methods).
If seurat_integrated has a memory error, run on the HPC
conda env create –file r3.6_seurat.yml
name: r3.6_seurat
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- conda-forge::r-essentials=3.6
- conda-forge::libblas=3.8.0=14_mkl
- openblas
- r-doparallel
- conda-forge::r-seurat=3.2.3
conda activate r3.6_seurat
#---------- Batch_Correction_SCTransform
# The data is preprocessed by splitting into individual ‘assays’.
# In this experiment, each individual sample is considered an ‘assay’.
# Ensure the dataset used here is the non-normalized dataset
DefaultAssay(seurat_filtered) <- "RNA"
#---------- Normalizing the data
# global-scaling normalization method “LogNormalize” that normalizes the feature expression
# measurements for each cell by the total expression, multiplies this by a scale factor
# (10,000 by default), and log-transforms the result.
# Normalized values are stored in seurat_obj[["RNA"]]@data.
# identical(seurat_norm@assays$RNA@counts, seurat_norm@assays$RNA@data)
# Split seurat object by condition to perform cell cycle scoring and SCT
seurat_split <- SplitObject(seurat_filtered, split.by = "sample")
for (i in 1:length(seurat_split)) {
seurat_split[[i]] <- NormalizeData(seurat_split[[i]],
verbose = TRUE)
seurat_split[[i]] <- CellCycleScoring(seurat_split[[i]],
g2m.features=m_g2m_genes,
s.features=m_s_genes)
seurat_split[[i]] <- SCTransform(seurat_split[[i]],
vars.to.regress = c("mitoRatio"))
}
#---------- Identification of highly variable features
# calculate a subset of features that exhibit high cell-to-cell variation in the dataset
# (i.e, they are highly expressed in some cells, and lowly expressed in others)
# genes names stored in seurat_norm@assays[["RNA"]]@var.features
# variable features stored in seurat_norm@assays[["RNA"]]@meta.features
#----- Select the most variable features to use for integration
seurat_features <- SelectIntegrationFeatures(object.list = seurat_split,
nfeatures = 2000)
#----- Prepare the SCT list object for integration
seurat_split <- PrepSCTIntegration(object.list = seurat_split,
anchor.features = seurat_features)
#----- modified to use young samples as the reference
# first time used reference=c(9,10,11,12,13,14,15,16)
seurat_anchors <- FindIntegrationAnchors(object.list = seurat_split,
normalization.method = "SCT",
anchor.features = seurat_features)
#---- Integrate across conditions
seurat_integrated <- IntegrateData(anchorset = seurat_anchors,
normalization.method = "SCT")
#---- Run PCA
seurat_integrated <- RunPCA(object = seurat_integrated)
ElbowPlot(object = seurat_integrated,
ndims = 50)
#---- Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated,
dims = 1:40,
reduction = "pca")
#---- save rds
saveRDS(seurat_integrated, file="seurat_results/seurat_integrated.rds")
#---- Remove data
rm(seurat_filtered,seurat_split,seurat_features,seurat_anchors)
#---------- Plot PCA
#----- Load data
seurat_integrated <- readRDS(file = "seurat_results/seurat_integrated.rds")
#----- PCA
PCAPlot(seurat_integrated,
group.by = "sample",
split.by = "group")
#----- Explore heatmap of PCs
DimHeatmap(seurat_integrated,
dims = 1:9,
cells = 500,
balanced = TRUE)
#----- Printing out the most variable genes driving PCs
print(x = seurat_integrated[["pca"]],
dims = 1:9,
nfeatures = 5)
## PC_ 1
## Positive: C1qa, C1qb, C1qc, Cst3, Ctss
## Negative: Plp1, Ptgds, Car2, Cnp, Cryab
## PC_ 2
## Positive: Plp1, Ptgds, Trf, Mbp, Cnp
## Negative: Aldoc, Clu, Atp1a2, Apoe, Gpr37l1
## PC_ 3
## Positive: Cldn5, Bsg, Ly6c1, Ly6a, Flt1
## Negative: Meg3, Snhg11, Atp1b1, Pcp4, Snap25
## PC_ 4
## Positive: Apoe, Aldoc, Cst3, Clu, Mt1
## Negative: Meg3, Snhg11, Atp1b1, Snap25, Bsg
## PC_ 5
## Positive: Atp1b1, Mt1, Gad1, Aldoc, Clu
## Negative: C1ql1, Pdgfra, Olig1, Matn4, Tnr
## PC_ 6
## Positive: Npy, Vtn, Fabp7, Gad1, Frzb
## Negative: Rtn1, Ly6h, Chn1, Cck, Thy1
## PC_ 7
## Positive: Gad1, Meis2, Shisa8, Synpr, Cspg5
## Negative: Vtn, Apod, Chn1, Ptn, Uchl1
## PC_ 8
## Positive: Cd52, Apoe, Fxyd5, Rpl32, Lyz2
## Negative: Vtn, Higd1b, Myl9, Ndufa4l2, Rgs5
## PC_ 9
## Positive: Nbl1, Mgp, Cald1, Higd1b, Ndufa4l2
## Negative: Apod, Fabp7, Npy, Frzb, Cldn5
#----- Plot the elbow plot
ElbowPlot(object = seurat_integrated,
ndims = 50)
#---------- Plot UMAP
DimPlot(seurat_integrated,
reduction = "umap",
split.by = "group")
Seurat uses a graph-based clustering approach, which embeds cells in a graph
structure, using a K-nearest neighbor (KNN) graph (by default), with edges
drawn between cells with similar gene expression patterns.
The resolution is an important argument that sets the “granularity” of the
downstream clustering and will need to be optimized for every individual
experiment. For datasets of 3,000 - 5,000 cells, the resolution set between
0.4-1.4 generally yields good clustering. Increased resolution values lead to a
greater number of clusters, which is often required for larger datasets.
#---------- Clustering_Cells
#----- Determine the K-nearest neighbor graph
seurat_cluster <- FindNeighbors(object = seurat_integrated,
dims = 1:40)
#----- Determine the clusters for various resolutions
seurat_cluster <- FindClusters(object = seurat_cluster,
resolution = c(0.2, 0.4, 0.6, 0.8, 1.0, 1.2))
#---- save rds
saveRDS(seurat_cluster, file="seurat_results/seurat_cluster.rds")
#----- Remove data
rm(seurat_integrated)
#----- Load data
seurat_cluster <- readRDS(file = "seurat_results/seurat_cluster.rds")
#---------- Explore resolutions
#----- Assign identity of clusters
Idents(object = seurat_cluster) <- "integrated_snn_res.0.4"
#----- Plot UMAP
DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE)
UMAP_0.4
Resolution 0.4
#----- Extract identity and sample information from seurat object to
# determine the number of cells per cluster per sample
FetchData(seurat_cluster, vars = c("ident", "sample")) %>%
dplyr::count(ident, sample) %>%
tidyr::spread(ident, n) %>%
kable(caption="Number of Cells",
format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| sample | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Old1 | 270 | 179 | 196 | 181 | 143 | 130 | 197 | 78 | 109 | 187 | 93 | 86 | 45 | 60 | 47 | 16 | 8 | 20 | 7 | 36 | 19 | 18 | 17 | 13 | 19 |
| Old2 | 176 | 142 | 142 | 266 | 79 | 164 | 144 | 37 | 93 | 127 | 83 | 67 | 62 | 50 | 29 | 17 | 4 | 14 | 8 | 17 | 7 | 29 | 18 | 14 | 10 |
| Old3 | 652 | 522 | 560 | 276 | 410 | 560 | 289 | 294 | 286 | 160 | 247 | 159 | 168 | 85 | 84 | 48 | 58 | 30 | 29 | 50 | 50 | 38 | 34 | 32 | 33 |
| Old4 | 729 | 447 | 424 | 273 | 425 | 309 | 292 | 258 | 250 | 217 | 201 | 159 | 135 | 74 | 73 | 54 | 72 | 39 | 28 | 42 | 30 | 40 | 29 | 41 | 27 |
| Old5 | 354 | 252 | 185 | 275 | 207 | 248 | 141 | 109 | 150 | 90 | 112 | 56 | 72 | 33 | 46 | 57 | 65 | 29 | 20 | 40 | 22 | 20 | 19 | 14 | 23 |
| Old6 | 353 | 247 | 195 | 364 | 169 | 176 | 134 | 133 | 107 | 97 | 163 | 66 | 60 | 46 | 33 | 53 | 45 | 33 | 34 | 17 | 17 | 11 | 17 | 21 | 16 |
| Old7 | 419 | 337 | 293 | 262 | 270 | 265 | 212 | 175 | 168 | 57 | 173 | 87 | 62 | 80 | 35 | 20 | 36 | 27 | 12 | 17 | 43 | 22 | 39 | 22 | 24 |
| Old8 | 558 | 396 | 318 | 306 | 317 | 251 | 208 | 193 | 202 | 61 | 246 | 89 | 43 | 51 | 46 | 26 | 32 | 31 | 18 | 35 | 60 | 31 | 29 | 35 | 24 |
| Young1 | 266 | 143 | 82 | 185 | 132 | 132 | 89 | 80 | 39 | 36 | 108 | 27 | 14 | 26 | 16 | 22 | 39 | 20 | 14 | 18 | 13 | 9 | 6 | 24 | 6 |
| Young2 | 263 | 204 | 228 | 106 | 142 | 124 | 103 | 139 | 147 | 170 | 84 | 67 | 18 | 22 | 26 | 15 | 4 | 24 | 45 | 27 | 49 | 29 | 20 | 8 | 18 |
| Young3 | 221 | 198 | 257 | 160 | 121 | 156 | 136 | 152 | 161 | 182 | 101 | 110 | 55 | 43 | 32 | 26 | 21 | 32 | 33 | 25 | 17 | 21 | 16 | 8 | 21 |
| Young4 | 361 | 259 | 258 | 166 | 209 | 150 | 157 | 183 | 159 | 113 | 130 | 101 | 52 | 49 | 29 | 16 | 6 | 36 | 41 | 27 | 31 | 21 | 17 | 22 | 30 |
| Young5 | 237 | 209 | 179 | 189 | 138 | 153 | 124 | 162 | 149 | 189 | 83 | 104 | 68 | 36 | 48 | 23 | 20 | 22 | 38 | 18 | 14 | 31 | 15 | 9 | 18 |
| Young6 | 261 | 160 | 189 | 206 | 142 | 142 | 116 | 159 | 132 | 204 | 129 | 93 | 34 | 27 | 41 | 20 | 13 | 24 | 41 | 14 | 18 | 13 | 19 | 32 | 6 |
| Young7 | 329 | 321 | 289 | 169 | 211 | 167 | 197 | 224 | 198 | 178 | 72 | 145 | 66 | 42 | 50 | 21 | 15 | 30 | 30 | 26 | 15 | 18 | 29 | 14 | 16 |
| Young8 | 369 | 315 | 347 | 125 | 174 | 136 | 203 | 194 | 201 | 178 | 44 | 142 | 108 | 66 | 71 | 20 | 14 | 39 | 36 | 20 | 19 | 29 | 19 | 8 | 14 |
#----- UMAP of cells in each cluster by sample
DimPlot(seurat_cluster,
label = TRUE,
split.by = "group") + NoLegend()
#----- plot
VlnPlot(object = seurat_cluster,
features = c("nGene", "nUMI", "log10GenesPerUMI", "mitoRatio"),
ncol = 2,
pt.size=0,
group.by="integrated_snn_res.0.4")
FeaturePlot(seurat_cluster,
reduction = "umap",
features = c("nUMI", "nGene", "log10GenesPerUMI", "mitoRatio"),
ncol = 2,
pt.size = 0.4,
sort.cell = TRUE,
min.cutoff = 'q10',
label = TRUE)
#----- cell phases
FetchData(seurat_cluster, vars = c("ident", "Phase")) %>%
dplyr::count(ident, Phase) %>%
tidyr::spread(ident, n) %>%
kable(caption="Number of Cells", format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| Phase | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| G1 | 1,320 | 1,764 | 1,825 | 1,540 | 504 | 905 | 964 | 995 | 1,163 | 1,086 | 1,105 | 560 | 258 | 354 | 394 | 238 | 175 | 173 | 78 | 71 | 106 | 138 | 127 | 216 | 34 |
| G2M | 1,818 | 1,132 | 1,191 | 939 | 1,425 | 1,143 | 889 | 616 | 654 | 709 | 579 | 627 | 539 | 237 | 175 | 110 | 106 | 161 | 228 | 226 | 136 | 165 | 109 | 58 | 238 |
| S | 2,680 | 1,435 | 1,126 | 1,030 | 1,360 | 1,215 | 889 | 959 | 734 | 451 | 385 | 371 | 265 | 199 | 137 | 106 | 171 | 116 | 128 | 132 | 182 | 77 | 107 | 43 | 33 |
#----- Explore whether clusters segregate by cell cycle phase
DimPlot(seurat_cluster,
label = TRUE,
split.by = "Phase") + NoLegend()
#----- plot
FeaturePlot(seurat_cluster,
reduction = "umap",
features = c("S.Score", "G2M.Score"),
ncol = 2,
pt.size = 0.4,
sort.cell = TRUE,
min.cutoff = 'q10',
label = TRUE)
#----- Defining the information in the seurat object of interest
# Extracting this data from the seurat object
umap_data <- FetchData(seurat_cluster,
vars = c(paste0("PC_", 1:16),
"ident", "UMAP_1", "UMAP_2"))
#----- Extract the UMAP coordinates for the first 10 cells
#seurat_cluster@reductions$umap@cell.embeddings[1:10, 1:2]
# Adding cluster label to center of cluster on UMAP
umap_label <- FetchData(seurat_cluster,
vars = c("ident", "UMAP_1", "UMAP_2")) %>%
group_by(ident) %>%
summarise(x=mean(UMAP_1), y=mean(UMAP_2))
#----- Plotting a UMAP plot for each of the PCs
map(paste0("PC_", 1:16), function(pc){
ggplot(umap_data,
aes(UMAP_1, UMAP_2)) +
geom_point(aes_string(color=pc),
alpha = 0.7) +
scale_color_gradient(guide = FALSE,
low = "grey90",
high = "blue") +
geom_text(data=umap_label,
aes(label=ident, x, y)) +
ggtitle(pc)
}) %>%
plot_grid(plotlist = .)
#----- Explore heatmap of PCs
DimHeatmap(seurat_cluster,
dims = 1:9,
cells = 500,
balanced = TRUE)
#----- Printing out the most variable genes driving PCs
print(x = seurat_cluster[["pca"]],
dims = 1:9,
nfeatures = 5)
## PC_ 1
## Positive: C1qa, C1qb, C1qc, Cst3, Ctss
## Negative: Plp1, Ptgds, Car2, Cnp, Cryab
## PC_ 2
## Positive: Plp1, Ptgds, Trf, Mbp, Cnp
## Negative: Aldoc, Clu, Atp1a2, Apoe, Gpr37l1
## PC_ 3
## Positive: Cldn5, Bsg, Ly6c1, Ly6a, Flt1
## Negative: Meg3, Snhg11, Atp1b1, Pcp4, Snap25
## PC_ 4
## Positive: Apoe, Aldoc, Cst3, Clu, Mt1
## Negative: Meg3, Snhg11, Atp1b1, Snap25, Bsg
## PC_ 5
## Positive: Atp1b1, Mt1, Gad1, Aldoc, Clu
## Negative: C1ql1, Pdgfra, Olig1, Matn4, Tnr
## PC_ 6
## Positive: Npy, Vtn, Fabp7, Gad1, Frzb
## Negative: Rtn1, Ly6h, Chn1, Cck, Thy1
## PC_ 7
## Positive: Gad1, Meis2, Shisa8, Synpr, Cspg5
## Negative: Vtn, Apod, Chn1, Ptn, Uchl1
## PC_ 8
## Positive: Cd52, Apoe, Fxyd5, Rpl32, Lyz2
## Negative: Vtn, Higd1b, Myl9, Ndufa4l2, Rgs5
## PC_ 9
## Positive: Nbl1, Mgp, Cald1, Higd1b, Ndufa4l2
## Negative: Apod, Fabp7, Npy, Frzb, Cldn5
#---------- Marker_Identification
#----- Select the RNA counts slot to be the default assay
DefaultAssay(seurat_cluster) <- "RNA"
#----- Normalize RNA data for visualization purposes
seurat_markers <- NormalizeData(seurat_cluster, verbose = FALSE)
#---- save rds
saveRDS(seurat_markers, file="seurat_results/seurat_markers.rds")
#---------- Marker_Identification
#----- Load data
seurat_markers <- readRDS(file = "seurat_results/seurat_markers.rds")
#----- remove the x-axis text and tick
# plot.margin to adjust the white space between each plot.
# ... pass any arguments to VlnPlot in Seurat
vlnplot_gg <- function(obj,
feature,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...){
p <- VlnPlot(obj, features = feature, pt.size = pt.size, ... ) +
xlab("") +
ylab(feature) +
ggtitle("") +
theme(legend.position = "none",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = rel(1), angle = 0),
axis.text.y = element_text(size = rel(1)),
plot.margin = plot.margin )
return(p)
}
#----- extract the max value of the y axis
vlnplot_max<- function(p){
ymax<- max(ggplot_build(p)$layout$panel_scales_y[[1]]$range$range)
return(ceiling(ymax))
}
#----- main function
vlnplot_stacked <- function(obj, features,
pt.size = 0,
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"),
...){
plot_list<- purrr::map(features, function(x) vlnplot_gg(obj = obj,
feature = x, ...))
# Add back x-axis title to bottom plot. patchwork is going to support this?
plot_list[[length(plot_list)]] <- plot_list[[length(plot_list)]] +
theme(axis.text.x=element_text(),
axis.ticks.x = element_line())
# change the y-axis tick to only max value
ymaxs <- purrr::map_dbl(plot_list, vlnplot_max)
plot_list <- purrr::map2(plot_list, ymaxs, function(x,y) x +
scale_y_continuous(breaks = c(y)) +
expand_limits(y = y))
p <- patchwork::wrap_plots(plotlist = plot_list, ncol = 1)
return(p)
}
#---------- Oligodendrocyte lineage (and OEG)
#----- OPC: Oligodendrocyte_Precursor_Cells
# Pdgfra Cluster 7
#----- OLG: Oligodendrocytes
# Cldn11 Cluster 0,4,5,20 (21)
#----- OEG: Olfactory_Ensheathing_Glia
# Npy Cluster 12
vlnplot_stacked(obj = seurat_markers,
features = c("Pdgfra", "Cldn11", "Npy"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Pdgfra","Cldn11", "Npy"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#---------- Astrocyte lineage (and NSC)
#----- NSC: Neural_Stem_Cells
# Thbs4 Cluster (11topleft)
#----- ARP: Astrocyte_Restricted_Precursors
# Cd44 Cluster (19left)
#----- ASC: Astrocytes
# Gja1 Cluster 2,8,9,11 (12,21,24)
vlnplot_stacked(obj = seurat_markers,
features = c("Thbs4", "Cd44", "Gja1"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Thbs4", "Cd44", "Gja1"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#---------- Neuronal lineage
#----- NRP: Neuronal_Restricted_Precursors
# Cdk1 Cluster 7middlebottom
#----- NEUR_immature: Neurons_Immature
# Sox11 Cluster 18left
#----- NEUR_mature: Neurons_Mature
# Syt1 Cluster 3,10,15,23,18right
#----- NendC: Neuroendocrine_Cells
# Baiap3 Cluster (10middle)
vlnplot_stacked(obj = seurat_markers,
features = c("Cdk1", "Sox11", "Syt1", "Baiap3"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Cdk1", "Sox11", "Syt1", "Baiap3"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#-------------------- Ependymal cells
#---------- EPC: Ependymocytes
# Ccdc153 Cluster 24top
#---------- HypEPC: Hypendymal_Cells
# Sspo Cluster
#---------- TNC: Tanycytes
# Rax Cluster
#---------- CPC: Choroid_Plexus_Epithelial_Cells
# Ttr Cluster 24bottom
vlnplot_stacked(obj = seurat_markers,
features = c("Ccdc153", "Sspo", "Rax", "Ttr"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Ccdc153", "Sspo", "Rax", "Ttr"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#-------------------- Vasculature cells
#---------- EC: Endothelial_Cells
# Cldn5 Cluster 6,13 (12)
#---------- PC: Pericytes
# Kcnj8 Cluster 22 (13)
#---------- Hb-VC: Hemoglobin_Expressing_Vascular_Cells
# Alas2 Cluster
#---------- VSMC: Vascular_Smooth_Muscle_Cells
# Acta2 Cluster (24) (13bottomright)
#---------- VLMC: Vascular_And_Leptomeningeal_Cells
# Slc6a13 Cluster 21bottom
#---------- ABC: Arachnoid_Barrier_Cells
# Slc47a1 Cluster 21top
vlnplot_stacked(obj = seurat_markers,
features = c("Cldn5", "Kcnj8", "Alas2", "Acta2", "Slc6a13", "Slc47a1"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Cldn5", "Kcnj8", "Alas2", "Acta2", "Slc6a13", "Slc47a1"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#-------------------- Immune cells
#---------- MG: Microglia
# Tmem119 Cluster 1,14
#---------- MNC: Monocytes
# Plac8 Cluster 19left
#---------- MAC: Macrophages
# Pf4 Cluster 17
#---------- DC: Dendritic_Cells
# Cd209a Cluster 19middle
#---------- NEUT: Neutrophils
# S100a9 Cluster
vlnplot_stacked(obj = seurat_markers,
features = c("Tmem119", "Plac8", "Pf4", "Cd209a", "S100a9"))
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Tmem119", "Plac8", "Pf4", "Cd209a", "S100a9"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#---------- UMAP
DimPlot(seurat_markers, reduction="umap", label = TRUE) + NoLegend()
#---------- Immune_Cells
# http://bio-bigdata.hrbmu.edu.cn/CellMarker/index.jsp
# https://panglaodb.se/index.html
# rownames(seurat_anno@assays$RNA)[grepl(pattern = "Cd1", rownames(seurat_anno@assays$RNA))]
#----- B_Cells
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Cd79b"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#----- Macrophages
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Cd14", "Cd68"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#----- T_Cells
FeaturePlot(seurat_markers,
reduction = "umap",
features = c("Cd4","Trbc2"),
min.cutoff = "q10",
max.cutoff = "q90",
label = TRUE)
#---------- Cluster_Annotation
#----- manually add annotations derived above
seurat_anno <- RenameIdents(seurat_markers,
`0` = "Oligodendrocytes",
`1` = "Microglia",
`2` = "Astrocytes",
`3` = "Neurons_Mature",
`4` = "Oligodendrocytes",
`5` = "Oligodendrocytes",
`6` = "Endothelial_Cells",
`7` = "Oligodendrocyte_Precursor_Cells",
`8` = "Astrocytes",
`9` = "Astrocytes",
`10` = "Neurons_Mature",
`11` = "Astrocytes",
`12` = "Olfactory_Ensheathing_Glia",
`13` = "Endothelial_Cells",
`14` = "Microglia",
`15` = "Neurons_Mature",
`16` = "Neurons_Mature",
`17` = "Macrophages",
`18` = "Neurons_Immature",
`19` = "Monocytes",
`20` = "Oligodendrocytes",
`21` = "Vascular_And_Leptomeningeal_Cells",
`22` = "Pericytes",
`23` = "Neurons_Mature",
`24` = "EPC")
#----- save these annotations in meta.data
seurat_anno@meta.data$cluster_annotation <- Idents(seurat_anno)
#---------- manually edit annotations
DimPlot(seurat_anno, reduction="umap", label = TRUE) + NoLegend()
metadata <- seurat_anno@meta.data
metadata$cluster_annotation <- "Na"
anno <- DimPlot(seurat_anno, reduction="umap", label = TRUE)
anno <- CellSelector(plot = anno)
metadata$cluster_annotation[metadata$cells %in% anno] <- "B_Cells"
seurat_anno@meta.data <- metadata
#----- Assign identity of clusters
Idents(object = seurat_anno) <- "cluster_annotation"
DimPlot(seurat_anno, reduction="umap", label = TRUE) + NoLegend()
#---- save rds
saveRDS(seurat_anno, file="seurat_results/seurat_anno.rds")
#----- Remove data
rm(seurat_markers,anno)
#---------- Cluster_Annotation
#----- Load data
seurat_anno <- readRDS(file = "seurat_results/seurat_anno.rds")
#----- UMAP cell clusters
DimPlot(seurat_anno,
reduction="umap",
label = FALSE)
DimPlot(seurat_anno,
reduction="umap",
label = TRUE,
label.size = 3,
repel = TRUE) + NoLegend()
DimPlot(seurat_anno,
reduction="umap",
label = TRUE,
split.by = "group",
label.size = 3,
repel = TRUE) + NoLegend()
#----- Celltype Counts
table(seurat_anno@meta.data$cluster_annotation)
##
## Arachnoid_Barrier_Cells Astrocytes
## 274 10436
## B_Cells Choroid_Plexus_Epithelial_Cells
## 113 66
## Dendritic_Cells Endothelial_Cells
## 24 3538
## Ependymocytes Macrophages
## 238 457
## Microglia Monocytes
## 5043 143
## Neuronal_Restricted_Precursors Neurons_Immature
## 34 275
## Neurons_Mature Olfactory_Ensheathing_Glia
## 6960 1070
## Oligodendrocyte_Precursor_Cells Oligodendrocytes
## 2535 12836
## Pericytes T_Cells
## 337 149
## Vascular_And_Leptomeningeal_Cells
## 106
#----- may want to ask if we are truely seperating interneurons from excitatory
DotPlot(seurat_anno,
features = c("Pdgfra", # Oligodendrocyte_Precursor_Cells
"Cldn11", # Oligodendrocytes
"Npy", # Olfactory_Ensheathing_Glia
"Thbs4", # Neural_Stem_Cells
"Cd44", # Astrocyte_Restricted_Precursors
"Gja1", # Astrocytes
"Cdk1", # Neuronal_Restricted_Precursors
"Sox11", # Neurons_Immature
"Syt1", # Neurons_Mature
"Baiap3", # Neuroendocrine_Cells
"Ccdc153", # Ependymocytes
"Sspo", # Hypendymal_Cells
"Rax", # Tanycytes
"Ttr", # Choroid_Plexus_Epithelial_Cells
"Cldn5", # Endothelial_Cells
"Kcnj8", # Pericytes
"Alas2", # Hemoglobin_Expressing_Vascular_Cells
"Acta2", # Vascular_Smooth_Muscle_Cells
"Slc6a13", # Vascular_And_Leptomeningeal_Cells
"Slc47a1", # Arachnoid_Barrier_Cells
"Tmem119", # Microglia
"Plac8", # Monocytes
"Pf4", # Macrophages
"Cd209a", # Dendritic_Cells
"S100a9", # Neutrophils
"Cd79b", # B_Cells
"Trbc2"), # T_Cells
cols = c("blue", "red"),
dot.scale = 8) +
RotatedAxis()
# If we want to remove a cell cluster
#seurat_subset <- subset(seurat_integrated, idents = "Endothelial_cells", invert = TRUE)
#---------- Cluster_Cell_Counts
# Cluster_Annotation_Number_of_Cells
seurat_anno@meta.data %>%
dplyr::count(cluster_annotation, sample) %>%
ggplot(aes(x=cluster_annotation, y=n, fill=sample)) +
geom_bar(position="dodge", stat="identity") +
ggtitle("Number of Cells") +
ylab("Cell Count") +
theme_classic() +
facet_wrap(~cluster_annotation, scale="free", ncol = 4)
# Cluster_Annotation_Frequency_of_Cells
seurat_anno@meta.data %>%
dplyr::count(cluster_annotation, sample) %>%
group_by(sample) %>%
mutate(freq = n / sum(n)) %>%
ggplot(aes(x=cluster_annotation, y=freq, fill=sample)) +
geom_bar(position="dodge", stat="identity") +
ggtitle("Frequency of Cells") +
theme_classic() +
#theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
facet_wrap(~cluster_annotation, scale="free", ncol = 4)
Celltype Proportion
Count of Celltype per sample / Count of all Celltypes per sample
#---------- Differential_Cell_Proportions
# beta regression models are designed for data between 0 and 1
# This give me percent differences
# emmeans(betafit,pairwise~Group)
# emmeans(betafit,pairwise~Group,type="response")
# To test that group is important, second way to be pvaule
# betafit1 <- betareg(Oligodendrocytes ~ Group, data = cellproportions)
# betafit2 <- betareg(Oligodendrocytes ~ 1, data = cellproportions)
# lrtest(betafit1,betafit2)
#----- Cell proportions
cellproportions <- seurat_anno@meta.data %>%
dplyr::count(cluster_annotation, sample) %>%
dplyr::group_by(sample) %>%
dplyr::mutate(freq = n / sum(n)) %>%
dplyr::select(-n) %>%
tidyr::spread(cluster_annotation,freq) %>%
replace(is.na(.), 0) %>%
cbind(data.frame(Group = rep(c("Old", "Young"),times=c(8,8))))
#----- Cell Types
cell_types <- seurat_anno@meta.data %>%
arrange(cluster_annotation) %>%
select(cluster_annotation) %>%
unique() %>%
pull(cluster_annotation)
#----- Cell proportion differences
cellproportion_differences <- lapply(cell_types, function(cell_type) {
lmfit <- lm(as.formula(paste(cell_type, "~ Group")), data = cellproportions)
offset <- cellproportions
offset[,cell_type] <- offset[,cell_type] + 0.000001
if (cell_type == "Neuronal_Restricted_Precursors") {
results <- data.frame(Celltype = cell_type,
lm_pvalue = summary(lmfit)$coefficients[2,4],
lm_diff = summary(lmfit)$coefficients[2,1],
betareg_pvalue = NA,
betareg_change = NA)
} else {
betafit <- betareg(as.formula(paste(cell_type, "~ Group")), data = offset)
results <- data.frame(Celltype = cell_type,
lm_pvalue = summary(lmfit)$coefficients[2,4],
lm_diff = summary(lmfit)$coefficients[2,1],
betareg_pvalue = summary(betafit)$coefficients$mean[2,4],
betareg_change = exp(summary(betafit)$coefficients$mean[2,1]))
}
return(results)
}) %>% bind_rows()
#----- Save Results
write_delim(cellproportion_differences,
"seurat_results/Aging_Mice_GSE129788_cellproportion_differences.txt",
delim = "\t")
#---------- Differential_Cell_Proportions_Figures
cellproportion_differences <-
read_delim("seurat_results/Aging_Mice_GSE129788_cellproportion_differences.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#----- Cell proportions
cellproportions <- seurat_anno@meta.data %>%
dplyr::count(cluster_annotation, sample) %>%
dplyr::group_by(sample) %>%
dplyr::mutate(freq = n / sum(n)) %>%
dplyr::select(-n) %>%
tidyr::spread(cluster_annotation,freq) %>%
replace(is.na(.), 0) %>%
cbind(data.frame(Group = rep(c("Old", "Young"),times=c(8,8))))
#----- Cell Types
cell_types <- seurat_anno@meta.data %>%
arrange(cluster_annotation) %>%
select(cluster_annotation) %>%
unique() %>%
pull(cluster_annotation)
#----- Cell proportions figure
cellproportions %>%
gather(key = "cluster_annotation", value = "freq", 2:(length(cell_types)+1)) %>%
ggplot(aes(x=cluster_annotation, y=freq, fill=Group)) +
geom_boxplot() +
theme_classic() +
scale_fill_brewer(palette="Dark2") +
facet_wrap(~cluster_annotation, scale="free", ncol = 4)
#----- Searchable Table
cellproportion_differences %>%
mutate_if(is.numeric, ~round(.,6)) %>%
DT::datatable()
Identification of all markers for each cluster
This type of analysis is typically recommended for when evaluating a single
sample group/condition. With the FindAllMarkers() function we are comparing each
cluster against all other clusters to identify potential marker genes. The cells
in each cluster are treated as replicates, and essentially a differential
expression analysis is performed with some statistical test. Also, by default
this function will return to you genes that exhibit both positive and negative
expression changes. Typically, we add an argument only.pos to opt for keeping only the positive changes.
Identification of conserved markers in all conditions
Since we have samples representing different conditions in our dataset, our best
option is to find conserved markers. This function internally separates out
cells by sample group/condition, and then performs differential gene expression
testing for a single specified cluster against all other clusters (or a second
cluster, if specified). Gene-level p-values are computed for each condition and
then combined across groups using meta-analysis methods from the MetaDE R
package. Before we start our marker identification we will explicitly set our
default assay, we want to use the original counts and not the integrated data.
DefaultAssay(seurat_anno) <- “RNA”
#---------- Conserved_Markers
# RNA stores the original and normalized counts for finding markers
# DefaultAssay(seurat_anno) <- "RNA"
# Ensembl & Symbol Gene IDs
#---- Gene Names
dir <- "cellranger_mRNA/"
name <- list.files(path = "cellranger_mRNA") %>%
gsub(pattern = "_count", replacement = "")
Gene_Emsembl_Symbol <- read_tsv(paste0(dir,name[[1]],
"_count/outs/filtered_feature_bc_matrix/features.tsv.gz"),
col_names = c("Gene_Emsembl","Gene_Symbol","Assay_type"),
col_types="ccc")[,c(1:2)] %>%
unique()
#---- Cell Types
cell_types <- seurat_anno@meta.data %>%
arrange(cluster_annotation) %>%
select(cluster_annotation) %>%
unique() %>%
pull(cluster_annotation)
#---------- Conserved Markers across sample groups
get_conserved <- function(seurat_rna,cell_type){
FindConservedMarkers(seurat_rna,
ident.1 = cell_type,
grouping.var = "group",
only.pos = TRUE,
min.diff.pct = 0.1,
min.pct = 0.1,
logfc.threshold = 0.25) %>%
rownames_to_column(var = "Gene_Symbol") %>%
left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>%
dplyr::mutate(Conserved_Markers = cell_type)
}
Conserved_Markers <- mclapply(cell_types, function(i) {
get_conserved(seurat_rna = seurat_anno,
cell_type = i)
}) %>% bind_rows()
#---- Save Results
write_delim(Conserved_Markers,
"seurat_results/Aging_Mice_GSE129788_Conserved_Markers.txt",
delim = "\t")
#---------- Conserved Marker Pathways
get_gprofiler <- function(markers,ident_name){
genes <- markers %>%
dplyr::filter(Conserved_Markers == ident_name)
gost(query = genes$Gene_Symbol,
organism = "mmusculus",
correction_method = "g_SCS")$result %>%
dplyr::filter(p_value <= 0.05) %>%
dplyr::mutate(Conserved_Markers = ident_name) %>%
dplyr::select(-parents)
}
Conserved_Markers_gprofiler_pathways <- mclapply(cell_types, function(i) {
get_gprofiler(Conserved_Markers,i)
}) %>% bind_rows()
#---- Save Results
write_delim(as.data.frame(Conserved_Markers_gprofiler_pathways),
"seurat_results/Aging_Mice_GSE129788_Conserved_Markers_gprofiler_pathways.txt",
delim = "\t")
#---------- Conserved_Markers_Counts
Conserved_Markers <- read_delim("seurat_results/Aging_Mice_GSE129788_Conserved_Markers.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#----- Number of Conserved_Markers
Conserved_Markers %>%
dplyr::count(Conserved_Markers) %>%
kable(caption="Number of Conserved_Markers",
format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| Conserved_Markers | n |
|---|---|
| Arachnoid_Barrier_Cells | 1,086 |
| Astrocytes | 440 |
| B_Cells | 255 |
| Choroid_Plexus_Epithelial_Cells | 731 |
| Dendritic_Cells | 552 |
| Endothelial_Cells | 1,203 |
| Ependymocytes | 1,411 |
| Macrophages | 660 |
| Microglia | 623 |
| Monocytes | 547 |
| Neuronal_Restricted_Precursors | 1,322 |
| Neurons_Immature | 635 |
| Neurons_Mature | 925 |
| Olfactory_Ensheathing_Glia | 543 |
| Oligodendrocyte_Precursor_Cells | 597 |
| Oligodendrocytes | 1,000 |
| Pericytes | 319 |
| T_Cells | 399 |
| Vascular_And_Leptomeningeal_Cells | 578 |
#----- Extract top 10 markers per cluster
# Conserved_Markers %>%
# group_by(Conserved_Markers) %>%
# top_n(n = 10,
# wt = max_pval) %>%
# kable(caption="Top10 Conserved_Markers",
# format.args = list(big.mark = ",")) %>%
# kable_styling(c("striped", "bordered"))
#----- Searchable table
Conserved_Markers %>%
mutate_if(is.numeric, ~round(.,4)) %>%
datatable()
#---------- Conserved_Markers_Gprofiler
#----- Read Pathways files
Conserved_Markers_gprofiler_pathways <-
read_delim("seurat_results/Aging_Mice_GSE129788_Conserved_Markers_gprofiler_pathways.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#----- Conserved_Markers_Gprofiler_Figure
get_gprofilerFig <- function(markers,ident_name){
genes <- markers %>%
filter(Conserved_Markers == ident_name)
gost(query = genes$Gene_Symbol,
organism = "hsapiens",
correction_method = "g_SCS") %>%
gostplot(capped = FALSE, interactive = FALSE)
}
mclapply(cell_types, function(i) {
cat(paste0("#----- ",i))
fig <- get_gprofilerFig(Conserved_Markers,i)
fig$data$query <- i
return(fig)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
#----- Conserved_Markers_Gprofiler_Top10Figure
get_gprofilerTop10 <- function(pathway,ident_name){
pathway %>%
filter(Conserved_Markers == ident_name) %>%
select(Conserved_Markers,
source,
term_id,
term_name,
term_size,
query_size,
intersection_size,
recall,
p_value) %>%
top_n(n = -10, wt = p_value) %>%
ggplot(aes(x=recall,
y=term_name,
colour=p_value,
size=intersection_size)) +
geom_point() +
expand_limits(x=0) +
labs(x="Hits (%)",
y=ident_name,
colour="p value",
size="Count") +
theme_bw()
}
mclapply(cell_types, function(i) {
cat(paste0("#----- ",i))
get_gprofilerTop10(Conserved_Markers_gprofiler_pathways,i)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
#----- Conserved_Markers_Gprofiler_Pathways
Conserved_Markers_gprofiler_pathways %>%
select(Conserved_Markers,
source,
term_id,
term_name,
term_size,
query_size,
intersection_size,
recall,
p_value) %>%
mutate_if(is.numeric, ~round(.,6)) %>%
datatable()
Find DEG between old and young mice per cell_type
Using 0.25 logFC
#---------- Differential_Expression
#----- RNA Assay stores the original and normalized counts
# Original counts : seurat_anno@assays[["RNA"]]@counts
# Normalized counts : seurat_anno@assays[["RNA"]]@data
DefaultAssay(seurat_anno) <- "RNA"
#----- Group Celltype ID
seurat_anno$group_cluster <- paste(seurat_anno$group, seurat_anno$cluster_annotation, sep = "_")
# make this the Idents()
Idents(seurat_anno) <- seurat_anno$group_cluster
#---------- Find Differential Expressed Genes
# get_deg <- function(seurat_rna,ident1,ident2){
# FindMarkers(seurat_rna,
# ident.1 = ident1,
# ident.2 = ident2,
# test.use = "MAST",
# latent.vars = NULL,
# logfc.threshold = 0,
# min.pct = 0,
# min.diff.pct = -Inf,
# min.cells.feature = 10,
# min.cells.group = 10,
# verbose = FALSE) %>%
# rownames_to_column(var = "Gene_Symbol") %>%
# left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>%
# mutate(Comparison = paste(ident1, ident2, sep = "_vs_"))
# }
# Differential_Expression <- mclapply(cell_types, function(i) {
# id1 <- paste0("Old_",i)
# id2 <- paste0("Young_",i)
# get_deg(seurat_anno,id1,id2)
# }) %>% bind_rows()
#----- Cell Types
cell_types <- seurat_anno@meta.data %>%
arrange(cluster_annotation) %>%
select(cluster_annotation) %>%
unique() %>%
pull(cluster_annotation)
#----- Remove cell types with less than 10 cell per group
cell_types <- cell_types[!cell_types %in% c("Dendritic_Cells",
"Neuronal_Restricted_Precursors")]
#----- Cell_Type DEG
for (cell_type in cell_types){
print(cell_type)
data <- FindMarkers(seurat_anno,
ident.1 = paste0("Old_",cell_type),
ident.2 = paste0("Young_",cell_type),
test.use = "MAST",
latent.vars = NULL,
logfc.threshold = 0,
min.pct = 0,
min.diff.pct = -Inf,
min.cells.feature = 10,
min.cells.group = 10,
verbose = FALSE) %>%
rownames_to_column(var = "Gene_Symbol") %>%
left_join(Gene_Emsembl_Symbol, by="Gene_Symbol") %>%
mutate(Comparison = paste("Old_vs_", "Young_",cell_type, sep = ""))
assign(paste("Old_vs_", "Young_",cell_type, sep = ""), data)
}
for(i in seq(length(cell_types))){
file <- paste("Old_vs_", "Young_",cell_types[i], sep = "")
print(file)
if(i == 1){
Differential_Expression <- get(file)
} else{
Differential_Expression <- rbind(Differential_Expression,
get(file))
}
}
#----- save the results
write_delim(Differential_Expression,
"seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt",
delim = "\t")
#---------- Differential_Expression_Genes
#---- Read Marker files
Differential_Expression <-
read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#---- Number of DEGs
merge(Differential_Expression %>%
dplyr::count(Comparison) %>%
dplyr::rename(Total_Genes = n),
Differential_Expression %>%
dplyr::filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 |
p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>%
dplyr::count(Comparison) %>%
dplyr::rename(Genes_FDR0.05_logFC0.25 = n),
by = "Comparison") %>%
kable(caption="Number of Differentially Expressed Genes",
format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| Comparison | Total_Genes | Genes_FDR0.05_logFC0.25 |
|---|---|---|
| Old_vs_Young_Arachnoid_Barrier_Cells | 13,539 | 2 |
| Old_vs_Young_Astrocytes | 15,194 | 59 |
| Old_vs_Young_B_Cells | 9,558 | 6 |
| Old_vs_Young_Endothelial_Cells | 15,329 | 169 |
| Old_vs_Young_Ependymocytes | 14,127 | 24 |
| Old_vs_Young_Macrophages | 13,400 | 6 |
| Old_vs_Young_Microglia | 14,607 | 136 |
| Old_vs_Young_Monocytes | 11,069 | 1 |
| Old_vs_Young_Neurons_Mature | 15,514 | 36 |
| Old_vs_Young_Olfactory_Ensheathing_Glia | 14,707 | 15 |
| Old_vs_Young_Oligodendrocyte_Precursor_Cells | 15,442 | 184 |
| Old_vs_Young_Oligodendrocytes | 14,715 | 88 |
| Old_vs_Young_Pericytes | 13,027 | 1 |
| Old_vs_Young_T_Cells | 11,091 | 14 |
#---- Extract top 10 DEGs per cluster
# Differential_Expression %>%
# group_by(Comparison) %>%
# top_n(n = -10,
# wt = p_val_adj) %>%
# kable(caption="Differentially Expressed Genes",
# format.args = list(big.mark = ",")) %>%
# kable_styling(c("striped", "bordered"))
#---- Searchable table
Differential_Expression %>%
filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 |
p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>%
select(Comparison,
Gene_Symbol,
Gene_Emsembl,
pct.1,
pct.2,
p_val,
p_val_adj,
avg_log2FC) %>%
mutate_if(is.numeric, ~round(.,6)) %>%
datatable()
#---------- Differential_Expression_GSEAranked
#----- GMT File
# edit GMT file cut -f 2 -d$'\t' --complement
gmt_file <- gmtPathways("seurat_results/Mouse_GOBP_AllPathways_no_GO_iea_January_13_2021_symbol_fgsea.gmt")
#----- Comparisons
comparisons = unique(Differential_Expression$Comparison)
#----- GSEAranked
Differential_Expression_GSEAranked <- mclapply(comparisons, function(i) {
rnk_file <- Differential_Expression %>%
filter(Comparison == i) %>%
select(Comparison,
Gene_Symbol,
p_val,
avg_log2FC) %>%
mutate(sign_log10Pvalue = sign(avg_log2FC) * -log10(p_val)) %>%
arrange(sign_log10Pvalue)
rnk_file <- setNames(rnk_file$sign_log10Pvalue, rnk_file$Gene_Symbol)
fgsea_rnk <- fgsea(gmt_file, rnk_file, nperm=1000, minSize=10, maxSize=500)
fgsea_rnk <- fgsea_rnk %>%
mutate(Comparison = i) %>%
select(-leadingEdge)
return(fgsea_rnk)
}) %>% bind_rows()
#----- save the results
write_delim(Differential_Expression_GSEAranked,
"seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_GSEAranked.txt",
delim = "\t")
#---------- Differential_Expression_GSEAranked
#----- GSEAranked
Differential_Expression_GSEAranked <-
read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_GSEAranked.txt", "\t", escape_double = FALSE, trim_ws = TRUE)
#----- GSEAranked Number of Pathways
Differential_Expression_GSEAranked %>%
dplyr::filter(padj <= 0.05) %>%
dplyr::count(Comparison) %>%
dplyr::rename(Pathways_FDR0.05 = n) %>%
kable(caption="Number of Pathways",
format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| Comparison | Pathways_FDR0.05 |
|---|---|
| Old_vs_Young_Oligodendrocyte_Precursor_Cells | 191 |
#----- GSEAranked Searchable table
Differential_Expression_GSEAranked %>%
filter(padj <= 0.05) %>%
arrange(padj) %>%
select(Comparison,
pathway,
pval,
padj,
ES,
NES) %>%
mutate_if(is.numeric, ~round(.,4)) %>%
datatable()
Senescence Pathways
#---------- Differential_Expression_Senescence_Pathways
#---- Differential_Expression_Genes
Differential_Expression <-
read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#----- Comparisons
comparisons = unique(Differential_Expression$Comparison)
#----- Senescence Pathways
senescence_pathways = list.files("../Senescence/mouse/")
#----- Senescence_Pathways_Enrichment
Senescence_Pathways_Enrichment <-
lapply(comparisons, function(i) {
genes <- Differential_Expression %>%
filter(Comparison == i) %>%
nrow()
genes_sig <- Differential_Expression %>%
filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 |
p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>%
filter(Comparison == i) %>%
nrow()
lapply(senescence_pathways, function(p) {
pathway <- read_delim(paste0("../Senescence/mouse/",p),
"\t", escape_double = FALSE, trim_ws = TRUE)
genes_pathway <- Differential_Expression %>%
filter(Comparison == i) %>%
filter(Gene_Symbol %in% pathway$Gene_Symbol) %>%
nrow()
genes_pathway_sig <- Differential_Expression %>%
filter(p_val_adj <= 0.05 & avg_log2FC >= 0.25 |
p_val_adj <= 0.05 & avg_log2FC <= -0.25) %>%
filter(Comparison == i) %>%
filter(Gene_Symbol %in% pathway$Gene_Symbol) %>%
nrow()
enrichment <- data.frame(rbind(Background=c(genes-genes_sig, genes_sig),
Foreground= c(genes_pathway-genes_pathway_sig, genes_pathway_sig)))
names(enrichment) <- c("Nonsignificant","Significant")
df <- data.frame("Comparison" = i,
"Genes" = genes,
"Sig_Genes" = genes_sig,
"Pathway" = p,
"Pathway_Genes" = nrow(pathway),
"Pathway_Back_Genes" = genes_pathway,
"Pathway_Sig_Genes" = genes_pathway_sig,
"Fisher_Pvalue" = fisher.test(enrichment, alternative="greater")$p.value,
"Fisher_OddsRatio"=fisher.test(enrichment, alternative="greater")$estimate)
return(df)
})
}) %>% bind_rows()
#----- save the results
write_delim(Senescence_Pathways_Enrichment,
"seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_Senescence_Pathways_Enrichment.txt",
delim = "\t")
#---------- Differential_Expression_Senescence_Pathways_Counts
#---- Differential_Expression_Senescence_Pathways
Senescence_Pathways_Enrichment <-
read_delim("seurat_results/Aging_Mice_GSE129788_OldvsYoung_Differential_Expression_MAST_Senescence_Pathways_Enrichment.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
#----- Differential_Expression_Senescence_Pathways Number
Senescence_Pathways_Enrichment %>%
dplyr::filter(Fisher_Pvalue <= 0.05) %>%
dplyr::count(Comparison) %>%
dplyr::rename(Fisher_Pvalue0.05 = n) %>%
kable(caption="Number of Pathways",
format.args = list(big.mark = ",")) %>%
kable_styling(c("striped", "bordered"))
| Comparison | Fisher_Pvalue0.05 |
|---|---|
| Old_vs_Young_Endothelial_Cells | 6 |
| Old_vs_Young_Ependymocytes | 1 |
| Old_vs_Young_Microglia | 1 |
| Old_vs_Young_Olfactory_Ensheathing_Glia | 2 |
| Old_vs_Young_Oligodendrocyte_Precursor_Cells | 3 |
| Old_vs_Young_Oligodendrocytes | 1 |
#----- Differential_Expression_Senescence_Pathways Searchable table
Senescence_Pathways_Enrichment %>%
filter(Fisher_Pvalue <= 0.05) %>%
arrange(Fisher_Pvalue) %>%
mutate(Fisher_Pvalue = round(Fisher_Pvalue, 4),
Fisher_OddsRatio = round(Fisher_OddsRatio, 2)) %>%
datatable()